####### INITIALIZE

### Load packages 
require(data.table)
require(bit64)
require(ggplot2)
require(tseries)
require(fUnitRoots)
require(dplyr)
require(forcats)

### Set directories
home.dir <-  "path-to-data-appendix/3_Analysis_of_TAQ_data"
data.dir <- file.path(home.dir, 'WRDS_Server/Output')
temp.dir <- file.path(home.dir, 'Final_Output/Tables')
fig.dir <- file.path(home.dir, 'Final_Output/Figures')

######## Functions
# Write function to calculate R2

# NOTE: Exchange code 'Q' represents "NASDAQ (Tape C)". We have aggregated all the NASDAQ observations into exchange code "T" in the SAS files.

# Function to get exchange names
getNames <- function(x){
  name <- "NA"
  if(x == "A") name <- "NYSE Market"
  if(x == "B") name <- "Nasdaq (BX)"
  if(x == "N") name <- "NYSE"
  if(x == "B") name <- "Nasdaq (BX)"
  if(x == "P") name <- "NYSE Arca"
  if(x == "C") name <- "NSX"
  if(x == "T") name <- "Nasdaq"
  if(x == "Q") name <- "Nasdaq"
  if(x == "D") name <- "FINRA"
  if(x == "X") name <- "Nasdaq (PSX)"
  if(x == "M") name <- "CHX"
  if(x == "Z") name <- "Bats BZX"
  if(x == "Y") name <- "Bats BYX"
  if(x == "K") name <- "Bats EDGX"
  if(x == "J") name <- "Bats EDGA"
  if(x == "W") name <- "CBOE"
  return(name)
}
getNames <- Vectorize(getNames)

# NOTE: Exchange abbreviation 'NSDQ2' represents "NASDAQ (Tape C)". We have aggregated all the NASDAQ observations into exchange code "T", or abbreviation
# 'NSDQ', in the SAS files.

# Function to get exchange abbreviations
getAbbr <- function(x){
  name <- "NA"
  if(x == "A") name <- "AMEX"
  if(x == "B") name <- "BX"
  if(x == "N") name <- "NYSE"
  if(x == "P") name <- "ARCA"
  if(x == "C") name <- "NSX"
  if(x == "T") name <- "NSDQ"
  if(x == "Q") name <- "NSDQ2"
  if(x == "D") name <- "FINRA"
  if(x == "X") name <- "PSX"
  if(x == "M") name <- "CHX"
  if(x == "Z") name <- "BZX"
  if(x == "Y") name <- "BYX"
  if(x == "K") name <- "EDGX"
  if(x == "J") name <- "EDGA"
  if(x == "W") name <- "CBOE"
  if(x == "V") name <- "IEX"
  if(x == "I") name <- "ISE"
  return(name)
}
getAbbr <- Vectorize(getAbbr)



######## ANALYZE

###### s_ijt regressions

## Load data
df <- read.table(file.path(data.dir, "Market_Shares_2015.txt"), sep = "|",
                 stringsAsFactors = FALSE, header = TRUE)
## Clean data

# Convert to data table
df <- data.table(df)

#
# df[ , SYM_ROOT := sym_root]
# df[ , SYM_SUFFIX := sym_suffix]
# df[ , EX := ex]

# Clean data
df[ , SYM_SUFFIX := ifelse(is.na(SYM_SUFFIX), "", SYM_SUFFIX)]
df[ , SYMBOL     := trimws(paste(SYM_ROOT, SYM_SUFFIX))]
df[ , DATE       := as.Date(as.character(DATE), format = "%Y%m%d")]

# Keep Sample 1 Only
df <- df[df$f_Sample_1 == 1,]

#________________________________ Get Symbol-Exchange Average Figures in 2015 ________________________________

# Calculate sij_bar
agg <- aggregate(Sh_S_Standard ~ SYM_ROOT + SYM_SUFFIX + EX, df, mean, na.rm = TRUE)
colnames(agg) <- c("SYM_ROOT", "SYM_SUFFIX", "EX", "sij_bar")
# 
# df$sij_bar = df$Sh_S_Standard
# agg = df[,c('SYM_ROOT', 'SYM_SUFFIX', 'EX', 'sij_bar')]

# Extract results
results <- unique(df[,c("SYM_ROOT", "SYM_SUFFIX", "SYMBOL", "f_Sample_1", "f_Sample_1_T100", "f_NYSE_Listed", "EX"), with = FALSE])
results <- merge(results, agg, by = c("SYM_ROOT", "SYM_SUFFIX", "EX"), all.x = TRUE)

# Save results
save(results, file = file.path(temp.dir, "sij_bar_results.Rdata"))

# Re-load dataset
load(file.path(temp.dir, "sij_bar_results.Rdata"))

# Prepare plots
ex.order <- c("T", "P", "N", "Z", "K", "J", "Y", "X", "B","A","M", "C", "W")
results  <- results[results$EX %in% ex.order]
ex.order <- getAbbr(ex.order)
results$EX <- getAbbr(results$EX)
results$EX <- factor(results$EX, levels = ex.order)

results$NYSE <- "Non-NYSE Listed Securities"
results$NYSE[results$f_NYSE_Listed == 1] <- "NYSE-Listed Securities"
results$NYSE <- factor(results$NYSE, levels = c("NYSE-Listed Securities", "Non-NYSE Listed Securities"))

# Reorder columns

results <-results %>%
  mutate(EX = fct_relevel(EX, "NYSE","NSDQ", "ARCA", "BZX", "EDGX", "EDGA", "BYX", "BX", "PSX", "AMEX", "CHX", "NSX", "CBOE"))

results <- data.table(results)

# Look at the largest y values, to determine a good ylim for the plots

max(results$sij_bar)
max(results[results$f_Sample_1_T100==1]$sij_bar)


# (1). Plot NYSE
p1 <- ggplot(results[results$f_Sample_1_T100 == 1 & NYSE == 'NYSE-Listed Securities',], aes(EX, sij_bar))
p1 <- p1 + geom_boxplot()
p1 <- p1 + coord_cartesian(ylim = c(0, 0.425))
p1 <- p1 + labs(x = "",
                y = "Mean Market Share")
# p <- p + facet_wrap(~ NYSE, ncol = 2)
p1 <- p1 + theme(panel.spacing = unit(1, "lines"),
                 strip.text.x = element_text(size = 12))
p1
ggsave(file = file.path(fig.dir, 'sij_bar_2015_t100_NYSE_only.png'), width = 6, height = 4.5, units = "in")

# (2). Plot non-NYSE
p2 <- ggplot(results[results$f_Sample_1_T100 == 1 & NYSE == 'Non-NYSE Listed Securities',], aes(EX, sij_bar))
p2 <- p2 + geom_boxplot()
p2 <- p2 + coord_cartesian(ylim = c(0, 0.425))
p2 <- p2 + labs(x = "",
                y = "Mean Market Share")
# p <- p + facet_wrap(~ NYSE, ncol = 2)
p2 <- p2 + theme(panel.spacing = unit(1, "lines"),
                 strip.text.x = element_text(size = 12))
p2
ggsave(file = file.path(fig.dir, 'sij_bar_2015_t100_non_NYSE.png'), width = 6, height = 4.5, units = "in")


# (3). Plot NYSE vs. Non-NYSE
p3 <- ggplot(results[results$f_Sample_1_T100 == 1,], aes(EX, sij_bar))
p3 <- p3 + geom_boxplot()
p3 <- p3 + coord_cartesian(ylim = c(0, 0.425))
p3 <- p3 + labs(x = "",
                y = "Mean Market Share")
p3 <- p3 + facet_wrap(~ NYSE, ncol = 2)
p3 <- p3 + theme(panel.spacing = unit(1, "lines"),
                 strip.text.x = element_text(size = 12))
p3
ggsave(file = file.path(fig.dir, 'sij_bar_2015_t100_NYSE_non_NYSE.png'), width = 12, height = 4.5, units = "in")

